{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. 对数据做数据探索分析（可参考0_EDA_ diabetes.ipynb，不计分） <br/>\n",
    "2. 适当的特征工程（可参考1_FE_ diabetes.ipynb，不计分）  <br/>\n",
    "3. 采用5折交叉验证，分别用log似然损失和正确率，对Logistic回归模型的正则超参数调优。（各50分）"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### import工具包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 768 entries, 0 to 767\n",
      "Data columns (total 9 columns):\n",
      " #   Column                        Non-Null Count  Dtype  \n",
      "---  ------                        --------------  -----  \n",
      " 0   pregnants                     768 non-null    int64  \n",
      " 1   Plasma_glucose_concentration  768 non-null    int64  \n",
      " 2   blood_pressure                768 non-null    int64  \n",
      " 3   Triceps_skin_fold_thickness   768 non-null    int64  \n",
      " 4   serum_insulin                 768 non-null    int64  \n",
      " 5   BMI                           768 non-null    float64\n",
      " 6   Diabetes_pedigree_function    768 non-null    float64\n",
      " 7   Age                           768 non-null    int64  \n",
      " 8   Target                        768 non-null    int64  \n",
      "dtypes: float64(2), int64(7)\n",
      "memory usage: 54.1 KB\n"
     ]
    }
   ],
   "source": [
    "# 导入工具库\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sn\n",
    "\n",
    "# 读入数据\n",
    "train=pd.read_csv('pima-indians-diabetes.csv')\n",
    "train.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 数据准备"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 数据分离\n",
    "y=train['Target']\n",
    "X=train.drop(['Target'],axis=1)\n",
    "feat_names=X.columns\n",
    "\n",
    "# 数据标准化\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# 初始化特征的标准化器\n",
    "ss_X = StandardScaler()\n",
    "\n",
    "# 分别对训练和测试数据的特征进行标准化处理\n",
    "X = ss_X.fit_transform(X)\n",
    "\n",
    "X_train=pd.DataFrame(columns=feat_names,data=X)\n",
    "train=pd.concat([X_train,y],axis=1)\n",
    "train.to_csv('FE_pima-indians-diabetes.csv',index = False,header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.639947</td>\n",
       "      <td>0.848324</td>\n",
       "      <td>0.149641</td>\n",
       "      <td>0.907270</td>\n",
       "      <td>-0.692891</td>\n",
       "      <td>0.204013</td>\n",
       "      <td>0.468492</td>\n",
       "      <td>1.425995</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-1.123396</td>\n",
       "      <td>-0.160546</td>\n",
       "      <td>0.530902</td>\n",
       "      <td>-0.692891</td>\n",
       "      <td>-0.684422</td>\n",
       "      <td>-0.365061</td>\n",
       "      <td>-0.190672</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.233880</td>\n",
       "      <td>1.943724</td>\n",
       "      <td>-0.263941</td>\n",
       "      <td>-1.288212</td>\n",
       "      <td>-0.692891</td>\n",
       "      <td>-1.103255</td>\n",
       "      <td>0.604397</td>\n",
       "      <td>-0.105584</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.844885</td>\n",
       "      <td>-0.998208</td>\n",
       "      <td>-0.160546</td>\n",
       "      <td>0.154533</td>\n",
       "      <td>0.123302</td>\n",
       "      <td>-0.494043</td>\n",
       "      <td>-0.920763</td>\n",
       "      <td>-1.041549</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-1.141852</td>\n",
       "      <td>0.504055</td>\n",
       "      <td>-1.504687</td>\n",
       "      <td>0.907270</td>\n",
       "      <td>0.765836</td>\n",
       "      <td>1.409746</td>\n",
       "      <td>5.484909</td>\n",
       "      <td>-0.020496</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "0   0.639947                      0.848324        0.149641   \n",
       "1  -0.844885                     -1.123396       -0.160546   \n",
       "2   1.233880                      1.943724       -0.263941   \n",
       "3  -0.844885                     -0.998208       -0.160546   \n",
       "4  -1.141852                      0.504055       -1.504687   \n",
       "\n",
       "   Triceps_skin_fold_thickness  serum_insulin       BMI  \\\n",
       "0                     0.907270      -0.692891  0.204013   \n",
       "1                     0.530902      -0.692891 -0.684422   \n",
       "2                    -1.288212      -0.692891 -1.103255   \n",
       "3                     0.154533       0.123302 -0.494043   \n",
       "4                     0.907270       0.765836  1.409746   \n",
       "\n",
       "   Diabetes_pedigree_function       Age  Target  \n",
       "0                    0.468492  1.425995       1  \n",
       "1                   -0.365061 -0.190672       0  \n",
       "2                    0.604397 -0.105584       1  \n",
       "3                   -0.920763 -1.041549       0  \n",
       "4                    5.484909 -0.020496       1  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train=pd.read_csv('FE_pima-indians-diabetes.csv')\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "y=train['Target']\n",
    "X=train.drop(['Target'],axis=1)\n",
    "\n",
    "# 数据随机划分\n",
    "from sklearn.model_selection import train_test_split\n",
    "X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5折交叉验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "logloss of each fold is:  [0.5246369  0.47351113 0.51391346 0.4854342  0.40617739]\n",
      "cv logloss is: 0.48073461715735855\n"
     ]
    }
   ],
   "source": [
    "# 交叉验证用于评估模型性能和进行参数调优（模型选择）\n",
    "#分类任务中交叉验证缺省是采用StratifiedKFold\n",
    "#数据集比较大，采用5折交叉验证\n",
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "loss = cross_val_score(lr, X_train, y_train, cv=5, scoring='neg_log_loss')\n",
    "\n",
    "print ('logloss of each fold is: ',-loss)\n",
    "print ('cv logloss is:', -loss.mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy of each fold is:  [0.76422764 0.75609756 0.76422764 0.78861789 0.79508197]\n",
      "cv accuracy is: 0.7736505397840864\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "loss = cross_val_score(lr, X_train, y_train, cv=5, scoring='accuracy')\n",
    "\n",
    "print ('accuracy of each fold is: ',loss)\n",
    "print ('cv accuracy is:', loss.mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Logistic回归"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "log似然损失/GridSearchCV调优： 0.476918484421001 {'C': 1, 'penalty': 'l2'}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\xzz96\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n",
      "ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.\n",
      "\n",
      "  FitFailedWarning)\n",
      "C:\\Users\\xzz96\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n",
      "ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.\n",
      "\n",
      "  FitFailedWarning)\n",
      "C:\\Users\\xzz96\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n",
      "ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.\n",
      "\n",
      "  FitFailedWarning)\n",
      "C:\\Users\\xzz96\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n",
      "ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.\n",
      "\n",
      "  FitFailedWarning)\n",
      "C:\\Users\\xzz96\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n",
      "ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.\n",
      "\n",
      "  FitFailedWarning)\n",
      "C:\\Users\\xzz96\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n",
      "ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.\n",
      "\n",
      "  FitFailedWarning)\n",
      "C:\\Users\\xzz96\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n",
      "ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.\n",
      "\n",
      "  FitFailedWarning)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "# 模型实例化\n",
    "lr=LogisticRegression()\n",
    "\n",
    "#需要调优的参数              }\n",
    "penaltys = ['l1','l2']\n",
    "Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]\n",
    "tuned_parameters = dict(penalty = penaltys, C = Cs)\n",
    "\n",
    "# 参数调优\n",
    "grid= GridSearchCV(lr, tuned_parameters,cv=3, scoring='neg_log_loss')\n",
    "grid.fit(X_train,y_train)\n",
    "\n",
    "# examine the best model\n",
    "print('log似然损失/GridSearchCV调优：',-grid.best_score_,grid.best_params_)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy/GridSearchCV调优： 0.7752989000478241 {'C': 1, 'penalty': 'l2'}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\xzz96\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n",
      "ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.\n",
      "\n",
      "  FitFailedWarning)\n",
      "C:\\Users\\xzz96\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n",
      "ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.\n",
      "\n",
      "  FitFailedWarning)\n",
      "C:\\Users\\xzz96\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n",
      "ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.\n",
      "\n",
      "  FitFailedWarning)\n",
      "C:\\Users\\xzz96\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n",
      "ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.\n",
      "\n",
      "  FitFailedWarning)\n",
      "C:\\Users\\xzz96\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n",
      "ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.\n",
      "\n",
      "  FitFailedWarning)\n",
      "C:\\Users\\xzz96\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n",
      "ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.\n",
      "\n",
      "  FitFailedWarning)\n",
      "C:\\Users\\xzz96\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: \n",
      "ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.\n",
      "\n",
      "  FitFailedWarning)\n"
     ]
    }
   ],
   "source": [
    "# 参数调优\n",
    "grid= GridSearchCV(lr, tuned_parameters,cv=3, scoring='accuracy')\n",
    "grid.fit(X_train,y_train)\n",
    "\n",
    "# examine the best model\n",
    "print('accuracy/GridSearchCV调优：',grid.best_score_,grid.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "log似然损失/LogisticRegression调优：\n",
      "best_C: 1\n",
      "score: 0.4806634617628206\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegressionCV\n",
    "\n",
    "Cs = [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]\n",
    "\n",
    "# 大量样本（6W+）、高维度（93），L1正则 --> 可选用saga优化求解器(0.19版本新功能)\n",
    "# LogisticRegressionCV比GridSearchCV快\n",
    "lrcv_L2 = LogisticRegressionCV(Cs=Cs, cv = 5, scoring='neg_log_loss', penalty='l2', solver='liblinear', multi_class='ovr')\n",
    "lrcv_L2.fit(X_train, y_train)    \n",
    "#lrcv_L2.scores_\n",
    "mse_mean=-np.mean(lrcv_L2.scores_[1],axis=0)\n",
    "#print('mse_mean:',mse_mean)\n",
    "best_C = np.argmin(mse_mean)\n",
    "best_score = np.min(mse_mean)\n",
    "print('log似然损失/LogisticRegression调优：')\n",
    "print ('best_C:',Cs[best_C])\n",
    "print('score:',best_score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy/LogisticRegression调优：\n",
      "best_C: 0.001\n",
      "score: 0.742782886845262\n"
     ]
    }
   ],
   "source": [
    "lrcv_L2 = LogisticRegressionCV(Cs=Cs, cv = 5, scoring='accuracy', penalty='l2', solver='liblinear', multi_class='ovr')\n",
    "lrcv_L2.fit(X_train, y_train)    \n",
    "#lrcv_L2.scores_\n",
    "mse_mean=np.mean(lrcv_L2.scores_[1],axis=0)\n",
    "#print('mse_mean:',mse_mean)\n",
    "best_C = np.argmin(mse_mean)\n",
    "best_score = np.min(mse_mean)\n",
    "print('accuracy/LogisticRegression调优：')\n",
    "print ('best_C:',Cs[best_C])\n",
    "print('score:',best_score)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
